rm(list = ls())
nc_sub <- read.csv("data/nc_random_forest.csv")

classify <- function(df) {
  
  eth <- c("whi", "bla", "his", "asi", "oth")
  
  # Create dummy  for each race, where 1 indicates race with highest probability and all other dummies are 0
  for (k in 1:5) {
    df[paste("pred", eth[k], "class", sep = ".")] <- ifelse(is.na(df[paste("pred", eth[k], sep = ".")]), NA, 0)
    df[, paste("pred", eth[k], "class", sep = ".")] <- 
      ifelse(df[paste("pred", eth[k], sep = ".")] > df[paste("pred", eth[-k][1], sep = ".")] & 
               df[paste("pred", eth[k], sep = ".")] > df[paste("pred", eth[-k][2], sep = ".")] & 
               df[paste("pred", eth[k], sep = ".")] > df[paste("pred", eth[-k][3], sep = ".")] & 
               df[paste("pred", eth[k], sep = ".")] > df[paste("pred", eth[-k][4], sep = ".")], 
             1, df[, paste("pred", eth[k], "class", sep = ".")])
  }
  return(df)
}

nc_preds <- subset(nc_sub, select = c("pred.whi", "pred.bla", "pred.lat", "pred.asi", "pred.oth"))
nc_preds$pred.his <- nc_preds$pred.lat
nc_preds$pred.lat <- NULL

nc_test <- classify(nc_preds)

nc_sub$predicted.white <- as.numeric(nc_test$pred.whi.class)
nc_sub$predicted.black <- as.numeric(nc_test$pred.bla.class)
nc_sub$predicted.latino <- as.numeric(nc_test$pred.his.class)
nc_sub$predicted.asian <- as.numeric(nc_test$pred.asi.class)
nc_sub$predicted.other <- as.numeric(nc_test$pred.oth.class)

nc_sub$predicted.race = ""
nc_sub$predicted.race[nc_sub$predicted.white == 1] <- "White"
nc_sub$predicted.race[nc_sub$predicted.black == 1] <- "Black"
nc_sub$predicted.race[nc_sub$predicted.latino == 1] <- "Hispanic"
nc_sub$predicted.race[nc_sub$predicted.asian == 1] <- "Asian"
nc_sub$predicted.race[nc_sub$predicted.other == 1] <- "Other"

nc_sub$predicted.white.rf <- ifelse(nc_sub$rf.race == "White", 1, 0)
nc_sub$predicted.black.rf <- ifelse(nc_sub$rf.race == "Black", 1, 0)
nc_sub$predicted.latino.rf <- ifelse(nc_sub$rf.race == "Hispanic", 1, 0)
nc_sub$predicted.asian.rf <- ifelse(nc_sub$rf.race == "Asian", 1, 0)
nc_sub$predicted.other.rf <- ifelse(nc_sub$rf.race == "Other", 1, 0)


#wrong prediction and income with all/rf data
nc_sub$wrong.race <- ifelse(nc_sub$sr.race != nc_sub$predicted.race, 1, 0)
nc_sub$wrong.race.rf <- ifelse(nc_sub$sr.race != nc_sub$rf.race, 1, 0)
nc_sub$sr.race.factor <- as.factor(nc_sub$sr.race)
nc_sub$income_rounded <- as.numeric(nc_sub$income_rounded)

##############################################
#TABLE 1 - random forest overall error rate
round(mean(nc_sub$wrong.race.rf, na.rm = T), 3)
#0.148
nc_sub$rf.race.factor <- as.factor(nc_sub$rf.race)
#TABLE 1 - F1-score
library(yardstick)
f_meas(nc_sub, truth = sr.race.factor, estimate = rf.race.factor)
#0.587

#TABLE 1 - false negative - i.e.  classifying a latino as non-latino
round(mean(nc_sub$wrong.race.rf[nc_sub$sr.race == "White"], na.rm = T), 3)
#0.083
round(mean(nc_sub$wrong.race.rf[nc_sub$sr.race == "Black"], na.rm = T), 3)
#0.271
round(mean(nc_sub$wrong.race.rf[nc_sub$sr.race == "Hispanic"], na.rm = T), 3)
#0.226
round(mean(nc_sub$wrong.race.rf[nc_sub$sr.race == "Asian"], na.rm = T), 3)
#0.348
round(mean(nc_sub$wrong.race.rf[nc_sub$sr.race == "Other"], na.rm = T), 3)
#0.992

#TABLE 1 - false positive - i.e.  classifying a non-white as white
round(mean(nc_sub$predicted.white.rf[nc_sub$sr.race != "White"], na.rm = T), 3)
#0.255
round(mean(nc_sub$predicted.black.rf[nc_sub$sr.race != "Black"], na.rm = T), 3)
#0.065
round(mean(nc_sub$predicted.latino.rf[nc_sub$sr.race != "Hispanic" ], na.rm = T), 3)
#0.017
round(mean(nc_sub$predicted.asian.rf[nc_sub$sr.race != "Asian"], na.rm = T), 3)
#0.007
round(mean(nc_sub$predicted.other.rf[nc_sub$sr.race != "Other"], na.rm = T), 3)
#0.001


################################
###########FIGURE 2#############
################################

nc_black <- subset(nc_sub, sr.race == "Black")
nc_white <- subset(nc_sub, sr.race == "White")
nc_latino <- subset(nc_sub, sr.race == "Hispanic")
nc_asian <- subset(nc_sub, sr.race == "Asian")

#BLACK
black.wrong.race <- tapply(X = nc_black$wrong.race, INDEX = nc_black$income_rounded, function(x) mean(x, na.rm = T))
black.wrong.race.forest <- tapply(X = nc_black$wrong.race.rf, INDEX = nc_black$income_rounded, function(x) mean(x, na.rm = T))
point.size <- tapply(X = nc_black$wrong.race, INDEX = nc_black$income_rounded, length)

m <- loess(black.wrong.race ~ names(black.wrong.race), weights = point.size, span = .6)
m.forest <- loess(black.wrong.race.forest ~ names(black.wrong.race.forest), weights = point.size, span = .6)

plot(names(black.wrong.race), black.wrong.race, ylim = c(0, 1), xlab = "Median Income of Census Tract ($1,000)", main = "Black Voters", ylab = "Misclassification Rate", pch = 16, cex = 0, col = "#99999999", axes = F, xlim = c(0, 220000))
axis(side = 1, labels = seq(0, 220, 20), at = seq(0, 220000, 20000), cex.axis = .9)
axis(side = 2, labels = seq(0, 1, .1), at = seq(0, 1, .1), las = 2)
lines(x=m$x, y=m$fitted, lwd = 3, col = "dark red")
lines(x=m.forest$x, y=m.forest$fitted, lwd = 5, col = "dark green", lty = 2)
box()
text(80000, .9, "BISG", pos = 4)
text(80000, .4, "BISG + Random Forest", pos = 4)

#LATINO
latino.wrong.race <- tapply(X = nc_latino$wrong.race, INDEX = nc_latino$income_rounded, function(x) mean(x, na.rm = T))
latino.wrong.race.forest <- tapply(X = nc_latino$wrong.race.rf, INDEX = nc_latino$income_rounded, function(x) mean(x, na.rm = T))
point.size <- tapply(X = nc_latino$wrong.race, INDEX = nc_latino$income_rounded, length)

m <- loess(latino.wrong.race ~ names(latino.wrong.race), weights = point.size, span = .6)
m.forest <- loess(latino.wrong.race.forest ~ names(latino.wrong.race.forest), weights = point.size, span = .6)

plot(names(latino.wrong.race), latino.wrong.race, ylim = c(0, 1), xlab = "Median Income of Census Tract ($1,000)", main = "Hispanic Voters", ylab = "Misclassification Rate", pch = 16, cex = 0, col = "#99999999", axes = F, xlim = c(0, 220000))
axis(side = 1, labels = seq(0, 220, 20), at = seq(0, 220000, 20000), cex.axis = .9)
axis(side = 2, labels = seq(0, 1, .1), at = seq(0, 1, .1), las = 2)
lines(x=m$x, y=m$fitted, lwd = 3, col = "dark red")
lines(x=m.forest$x, y=m.forest$fitted, lwd = 5, col = "dark green", lty = 2)
box()

#ASIAN
asian.wrong.race <- tapply(X = nc_asian$wrong.race, INDEX = nc_asian$income_rounded, function(x) mean(x, na.rm = T))
asian.wrong.race.forest <- tapply(X = nc_asian$wrong.race.rf, INDEX = nc_asian$income_rounded, function(x) mean(x, na.rm = T))
point.size <- tapply(X = nc_asian$wrong.race, INDEX = nc_asian$income_rounded, length)

m <- loess(asian.wrong.race ~ names(asian.wrong.race), weights = point.size, span = .6)
m.forest <- loess(asian.wrong.race.forest ~ names(asian.wrong.race.forest), weights = point.size, span = .6)

plot(names(asian.wrong.race), asian.wrong.race, ylim = c(0, 1), xlab = "Median Income of Census Tract ($1,000)", main = "Asian Voters", ylab = "Misclassification Rate", pch = 16, cex = 0, col = "#99999999", axes = F, xlim = c(0, 220000))
axis(side = 1, labels = seq(0, 220, 20), at = seq(0, 220000, 20000), cex.axis = .9)
axis(side = 2, labels = seq(0, 1, .1), at = seq(0, 1, .1), las = 2)
lines(x=m$x, y=m$fitted, lwd = 3, col = "dark red")
lines(x=m.forest$x, y=m.forest$fitted, lwd = 5, col = "dark green", lty = 2)
box()

#WHITE
white.wrong.race <- tapply(X = nc_white$wrong.race, INDEX = nc_white$income_rounded, function(x) mean(x, na.rm = T))
white.wrong.race.forest <- tapply(X = nc_white$wrong.race.rf, INDEX = nc_white$income_rounded, function(x) mean(x, na.rm = T))

point.size <- tapply(X = nc_white$wrong.race, INDEX = nc_white$income_rounded, length)
m <- loess(white.wrong.race ~ names(white.wrong.race), weights = point.size, span = .6)
m.forest <- loess(white.wrong.race.forest ~ names(white.wrong.race.forest), weights = point.size, span = .6)

plot(names(white.wrong.race), white.wrong.race, ylim = c(0, 1), xlab = "Median Income of Census Tract ($1,000)", main = "White Voters", ylab = "Misclassification Rate", pch = 16, cex = 0, col = "#99999999", axes = F, xlim = c(0, 220000))
axis(side = 1, labels = seq(0, 220, 20), at = seq(0, 220000, 20000), cex.axis = .9)
axis(side = 2, labels = seq(0, 1, .1), at = seq(0, 1, .1), las = 2)
lines(x=m$x, y=m$fitted, lwd = 3, col = "dark red")
lines(x=m.forest$x, y=m.forest$fitted, lwd = 5, col = "dark green", lty = 2)
box()


#############################
#######FIGURE 3##############
#############################

#create dummies for race variables
nc_sub$sr.white <- ifelse(nc_sub$sr.race == "White", 1, 0)
nc_sub$sr.black <- ifelse(nc_sub$sr.race == "Black", 1, 0)
nc_sub$sr.hispanic <- ifelse(nc_sub$sr.race == "Hispanic", 1, 0)
nc_sub$sr.asian <- ifelse(nc_sub$sr.race == "Asian", 1, 0)
nc_sub$sr.other <- ifelse(nc_sub$sr.race == "Other", 1, 0)

#tract-level aggregate variables
tract_pct_black_sr <- tapply(X = nc_sub$sr.black, INDEX = nc_sub$GEO_ID, FUN = function(x) mean(x, na.rm = T))
m <- match(nc_sub$GEO_ID, names(tract_pct_black_sr))
table(is.na(m))
nc_sub$tract_pct_black_sr <- tract_pct_black_sr[m]

tract_pct_black_bisg <- tapply(X = nc_sub$predicted.black, INDEX = nc_sub$GEO_ID, FUN = function(x) mean(x, na.rm = T))
m <- match(nc_sub$GEO_ID, names(tract_pct_black_bisg))
table(is.na(m))
nc_sub$tract_pct_black_bisg <- tract_pct_black_bisg[m]

tract_pct_black_rf <- tapply(X = nc_sub$predicted.black.rf, INDEX = nc_sub$GEO_ID, FUN = function(x) mean(x, na.rm = T))
m <- match(nc_sub$GEO_ID, names(tract_pct_black_rf))
table(is.na(m))
nc_sub$tract_pct_black_rf <- tract_pct_black_rf[m]


#turnout rate among Blacks by income category
#tracts located in each income category
nc_sub <- nc_sub[order(nc_sub$GEO_ID),]
nc_sub$N_cvap <- unlist(tapply(nc_sub$cvap.black, nc_sub$GEO_ID, function(x) 1:length(x)))

cvap.black.by.income <- tapply(X=nc_sub$cvap.black[nc_sub$N_cvap == 1], INDEX = nc_sub$income_rounded2[nc_sub$N_cvap == 1], FUN = function(x) sum(x, na.rm = T))

cvap.black.by.income2 <- c(sum(cvap.black.by.income[1], cvap.black.by.income[2]), cvap.black.by.income[3:12], sum(cvap.black.by.income[13:length(cvap.black.by.income)]))

#number of black voters by income category (self-reported)
black.votes.by.income <- tapply(X=nc_sub$voted.16g[nc_sub$sr.black == 1], INDEX = nc_sub$income_rounded2[nc_sub$sr.black == 1], FUN = function(x) sum(x, na.rm = T))

black.votes.by.income2 <- c(sum(black.votes.by.income[1], black.votes.by.income[2]), black.votes.by.income[3:12], sum(black.votes.by.income[13:length(black.votes.by.income)]))

#self-reported race, black turnout by income category
sr.black.turnout.by.income <- black.votes.by.income2/cvap.black.by.income2

#number of black voters by income category (bisg-model)
black.votes.by.income.bisg <- tapply(X=nc_sub$voted.16g[nc_sub$predicted.black == 1], INDEX = nc_sub$income_rounded2[nc_sub$predicted.black == 1], FUN = function(x) sum(x, na.rm = T))

black.votes.by.income.bisg2 <- c(sum(black.votes.by.income.bisg[1], black.votes.by.income.bisg[2]), black.votes.by.income.bisg[3:12], sum(black.votes.by.income.bisg[13:length(black.votes.by.income.bisg)]))

#self-reported race, black turnout by income category
sr.black.turnout.by.income.bisg <- black.votes.by.income.bisg2/cvap.black.by.income2

#number of black voters by income category (bisg-model)
black.votes.by.income.rf <- tapply(X=nc_sub$voted.16g[nc_sub$predicted.black.rf == 1], INDEX = nc_sub$income_rounded2[nc_sub$predicted.black.rf == 1], FUN = function(x) sum(x, na.rm = T))

black.votes.by.income.rf2 <- c(sum(black.votes.by.income.rf[1], black.votes.by.income.rf[2]), black.votes.by.income.rf[3:12], sum(black.votes.by.income.rf[13:length(black.votes.by.income.rf)]))

#self-reported race, black turnout by income category
sr.black.turnout.by.income.rf <- black.votes.by.income.rf2/cvap.black.by.income2

#overall turnout rate
#cvap black voters
black.voters.cvap <- sum(cvap.black$cvap_est[substr(cvap.black$GEO_ID, 1, 2) == "37"])
#number of black voters, self-reported
black.voters.sr <- sum(nc_sub$voted.16g[nc_sub$sr.black == 1])
#number of black voters, BISG
black.voters.bisg <- sum(nc_sub$voted.16g[nc_sub$predicted.black == 1])
#number of black voters, BISG+RF
black.voters.rf <- sum(nc_sub$voted.16g[nc_sub$predicted.black.rf == 1])

#self-reported race, turnout rate 
overall.sr <- black.voters.sr/black.voters.cvap
overall.bisg <- black.voters.bisg/black.voters.cvap
overall.rf <- black.voters.rf/black.voters.cvap


plot(seq(20, 130, 10), sr.black.turnout.by.income, ylim = c(0, .8), pch = 16, cex = 1.5, xlab= "Census Tract Income (in $1,000s)", ylab = "2016 Turnout Rate", axes = F, main = "Black Voters", xlim = c(0, 155))
points(seq(20, 130, 10), sr.black.turnout.by.income.bisg, pch = 15, cex = 1.5)
points(seq(20, 130, 10), sr.black.turnout.by.income.rf, pch = 17, cex = 1.5)
lines(seq(20, 130, 10), sr.black.turnout.by.income)
lines(seq(20, 130, 10), sr.black.turnout.by.income.bisg)
lines(seq(20, 130, 10), sr.black.turnout.by.income.rf)
axis(side = 1, at = seq(20, 130, 10), cex.axis = .8)
axis(side = 2, at = seq(0, 1, .1), las = 2)
box()
axis(side = 1, at = 0, "Overall")
points(0, overall.sr, pch = 16, cex = 1.6)
points(0, overall.bisg, pch = 15, cex = 1.6)
points(0, overall.rf, pch = 17, cex = 1.6)

text(130, sr.black.turnout.by.income[12], "Self-Reported", pos = 4)
text(130, sr.black.turnout.by.income.bisg[12], "BISG", pos = 4)
text(130, sr.black.turnout.by.income.rf[12], "BISG+\nRandom Forest", pos = 4)


#turnout rate among Whites by income category
#tracts located in each income category
nc_sub <- nc_sub[order(nc_sub$GEO_ID),]
nc_sub$N_cvap <- unlist(tapply(nc_sub$cvap.white, nc_sub$GEO_ID, function(x) 1:length(x)))

cvap.white.by.income <- tapply(X=nc_sub$cvap.white[nc_sub$N_cvap == 1], INDEX = nc_sub$income_rounded2[nc_sub$N_cvap == 1], FUN = function(x) sum(x, na.rm = T))

cvap.white.by.income2 <- c(sum(cvap.white.by.income[1], cvap.white.by.income[2]), cvap.white.by.income[3:12], sum(cvap.white.by.income[13:length(cvap.white.by.income)]))

#number of white voters by income category (self-reported)
white.votes.by.income <- tapply(X=nc_sub$voted.16g[nc_sub$sr.white == 1], INDEX = nc_sub$income_rounded2[nc_sub$sr.white == 1], FUN = function(x) sum(x, na.rm = T))

white.votes.by.income2 <- c(sum(white.votes.by.income[1], white.votes.by.income[2]), white.votes.by.income[3:12], sum(white.votes.by.income[13:length(white.votes.by.income)]))

#self-reported race, white turnout by income category
sr.white.turnout.by.income <- white.votes.by.income2/cvap.white.by.income2

#number of white voters by income category (bisg-model)
white.votes.by.income.bisg <- tapply(X=nc_sub$voted.16g[nc_sub$predicted.white == 1], INDEX = nc_sub$income_rounded2[nc_sub$predicted.white == 1], FUN = function(x) sum(x, na.rm = T))

white.votes.by.income.bisg2 <- c(sum(white.votes.by.income.bisg[1], white.votes.by.income.bisg[2]), white.votes.by.income.bisg[3:12], sum(white.votes.by.income.bisg[13:length(white.votes.by.income.bisg)]))

#self-reported race, white turnout by income category
sr.white.turnout.by.income.bisg <- white.votes.by.income.bisg2/cvap.white.by.income2

#number of white voters by income category (bisg-model)
white.votes.by.income.rf <- tapply(X=nc_sub$voted.16g[nc_sub$predicted.white.rf == 1], INDEX = nc_sub$income_rounded2[nc_sub$predicted.white.rf == 1], FUN = function(x) sum(x, na.rm = T))

white.votes.by.income.rf2 <- c(sum(white.votes.by.income.rf[1], white.votes.by.income.rf[2]), white.votes.by.income.rf[3:12], sum(white.votes.by.income.rf[13:length(white.votes.by.income.rf)]))

#self-reported race, white turnout by income category
sr.white.turnout.by.income.rf <- white.votes.by.income.rf2/cvap.white.by.income2

#overall turnout rate
#cvap black voters
white.voters.cvap <- sum(cvap.white$cvap_est[substr(cvap.black$GEO_ID, 1, 2) == "37"])
#number of black voters, self-reported
white.voters.sr <- sum(nc_sub$voted.16g[nc_sub$sr.white == 1])
#number of black voters, BISG
white.voters.bisg <- sum(nc_sub$voted.16g[nc_sub$predicted.white == 1])
#number of black voters, BISG+RF
white.voters.rf <- sum(nc_sub$voted.16g[nc_sub$predicted.white.rf == 1])

#self-reported race, turnout rate 
overall.sr.white <- white.voters.sr/white.voters.cvap
overall.bisg.white <- white.voters.bisg/white.voters.cvap
overall.rf.white <- white.voters.rf/white.voters.cvap


plot(seq(20, 130, 10), sr.white.turnout.by.income, ylim = c(.3, .95), pch = 16, cex = 1.5, xlab= "Census Tract Income (in $1,000s)", ylab = "2016 Turnout Rate", axes = F, main = "White Voters", xlim = c(0, 155))
points(seq(20, 130, 10), sr.white.turnout.by.income.bisg, pch = 15, cex = 1.5)
points(seq(20, 130, 10), sr.white.turnout.by.income.rf, pch = 17, cex = 1.5)
lines(seq(20, 130, 10), sr.white.turnout.by.income)
lines(seq(20, 130, 10), sr.white.turnout.by.income.bisg)
lines(seq(20, 130, 10), sr.white.turnout.by.income.rf)
axis(side = 1, at = seq(20, 130, 10), cex.axis = .8)
axis(side = 2, at = seq(0, 1, .1), las = 2)
box()
text(130, sr.white.turnout.by.income[12]-.02, "Self-Reported", pos = 4)
text(130, sr.white.turnout.by.income.bisg[12]+.02, "BISG", pos = 4)
text(130, sr.white.turnout.by.income.rf[12], "BISG+RForest", pos = 4)
points(0, overall.sr.white, pch = 16, cex = 1.6)
points(0, overall.bisg.white, pch = 15, cex = 1.6)
points(0, overall.rf.white, pch = 17, cex = 1.6)
axis(side = 1, at = 0, "Overall")





#############################
###########TABLE 2###########
#############################

#############################
#INCOME
#############################
#get income data
economic <- read.csv("data/nc_census_tract_income.csv")
economic$GEO_ID <- substring(economic$GEO_ID, 10)
m <- match(nc_sub$GEO_ID, economic$GEO_ID)
table(is.na(m))
nc_sub$median_income <- as.numeric(as.character(economic$median_household_income[m]))

#White
a <- summary(nc_sub$median_income[nc_sub$sr.white == 1])[4]
b <- summary(nc_sub$median_income[nc_sub$predicted.white == 1])[4]
c <- summary(nc_sub$median_income[nc_sub$predicted.white.rf == 1])[4]

round(a,0)
round(b,0)
round((b-a)/a*100,2)
round(c,0)
round((c-a)/a*100,2)

#Black
a <- summary(nc_sub$median_income[nc_sub$sr.black == 1])[4]
b <- summary(nc_sub$median_income[nc_sub$predicted.black == 1])[4]
c <- summary(nc_sub$median_income[nc_sub$predicted.black.rf == 1])[4]

round(a,0)
round(b,0)
round((b-a)/a*100,2)
round(c,0)
round((c-a)/a*100,2)

#Latino
a <- summary(nc_sub$median_income[nc_sub$sr.hispanic == 1])[4]
b <- summary(nc_sub$median_income[nc_sub$predicted.latino == 1])[4]
c <- summary(nc_sub$median_income[nc_sub$predicted.latino.rf == 1])[4]

round(a,0)
round(b,0)
round((b-a)/a*100,2)
round(c,0)
round((c-a)/a*100,2)

#Asian
a <- summary(nc_sub$median_income[nc_sub$sr.asian == 1])[4]
b <- summary(nc_sub$median_income[nc_sub$predicted.asian == 1])[4]
c <- summary(nc_sub$median_income[nc_sub$predicted.asian.rf == 1])[4]

round(a,0)
round(b,0)
round((b-a)/a*100,2)
round(c,0)
round((c-a)/a*100,2)


#############################
#HOUSING VALUES
#############################
#get housing value data
housing <- read.csv("data/housing_values.csv")
housing <- subset(housing, select = c("RegionName", "X1.31.18", "X2.28.18", "X3.31.18", "X4.30.18", "X5.31.18", "X6.30.18", "X7.31.18", "X8.31.18", "X9.30.18", "X10.31.18", "X11.30.18", "X12.31.18"))

median.value <- apply(X = housing[,2:ncol(housing)], MARGIN = 1, FUN = function(x) mean(x, na.rm = T))
housing$median.value <- median.value

m <- match(nc_sub$zip, housing$RegionName)
table(is.na(m))

nc_sub$median.house.price <- housing$median.value[m]


#White
a <- summary(nc_sub$median.house.price[nc_sub$sr.white == 1])[4]
b <- summary(nc_sub$median.house.price[nc_sub$predicted.white == 1])[4]
c <- summary(nc_sub$median.house.price[nc_sub$predicted.white.rf == 1])[4]

round(a,0)
round(b,0)
round((b-a)/a*100,2)
round(c,0)
round((c-a)/a*100,2)

#Black
a <- summary(nc_sub$median.house.price[nc_sub$sr.black == 1])[4]
b <- summary(nc_sub$median.house.price[nc_sub$predicted.black == 1])[4]
c <- summary(nc_sub$median.house.price[nc_sub$predicted.black.rf == 1])[4]

round(a,0)
round(b,0)
round((b-a)/a*100,2)
round(c,0)
round((c-a)/a*100,2)

#Latino
a <- summary(nc_sub$median.house.price[nc_sub$sr.hispanic == 1])[4]
b <- summary(nc_sub$median.house.price[nc_sub$predicted.latino == 1])[4]
c <- summary(nc_sub$median.house.price[nc_sub$predicted.latino.rf == 1])[4]

round(a,0)
round(b,0)
round((b-a)/a*100,2)
round(c,0)
round((c-a)/a*100,2)

#Asian
a <- summary(nc_sub$median.house.price[nc_sub$sr.asian == 1])[4]
b <- summary(nc_sub$median.house.price[nc_sub$predicted.asian == 1])[4]
c <- summary(nc_sub$median.house.price[nc_sub$predicted.asian.rf == 1])[4]

round(a,0)
round(b,0)
round((b-a)/a*100,2)
round(c,0)
round((c-a)/a*100,2)


#############################
#DONORS
#############################
donor.actual <- tapply(X=nc_sub$donor[nc_sub$donor == 1], INDEX = nc_sub$sr.race[nc_sub$donor == 1], function(x) length(x)) / sum(nc_sub$donor) * 100
donor.predicted <- tapply(X=nc_sub$donor[nc_sub$donor == 1], INDEX = nc_sub$predicted.race[nc_sub$donor == 1], function(x) length(x)) / sum(nc_sub$donor) * 100
donor.predicted.rf <- tapply(X=nc_sub$donor[nc_sub$donor == 1], INDEX = nc_sub$rf.race[nc_sub$donor == 1], function(x) length(x)) / sum(nc_sub$donor) * 100

round(donor.actual[c(5,2,3,1)], 2)
# White    Black    Hispanic    Asian 
# 84.36    12.32     0.73     1.31
round(donor.predicted[c(6,3,4,2)], 2)
# White    Black    Hispanic    Asian 
# 88.11     8.79     0.89     1.90
round((donor.predicted[c(6,3,4,2)] - donor.actual[c(5,2,3,1)]) / donor.actual[c(5,2,3,1)] * 100, 2)
# White    Black    Hispanic    Asian 
# 4.45   -28.64    21.67    44.94 

round(donor.predicted.rf[c(5,2,3,1)], 2)
# White    Black    Hispanic    Asian 
# 84.20    13.06     0.87     1.85 
round((donor.predicted.rf[c(5,2,3,1)] - donor.actual[c(5,2,3,1)]) / donor.actual[c(5,2,3,1)] * 100, 2)
# White    Black    Hispanic    Asian 
# -0.19     6.00    19.20    40.65 


#############################
#TURNOUT
#############################
#cvap WHITE voters
white.voters.cvap <- sum(cvap.white$cvap_est[substr(cvap.white$GEO_ID, 1, 2) == "37"])
#number of WHITE voters, self-reported
white.voters.sr <- sum(nc_sub$voted.16g[nc_sub$sr.white == 1])
overall.sr.white <- white.voters.sr/white.voters.cvap*100
#number of WHITE voters, BISG
white.voters.bisg <- sum(nc_sub$voted.16g[nc_sub$predicted.white == 1])
overall.bisg.white <- white.voters.bisg/white.voters.cvap*100
#number of WHITE voters, BISG+RF
white.voters.rf <- sum(nc_sub$voted.16g[nc_sub$predicted.white.rf == 1])
overall.rf.white <- white.voters.rf/white.voters.cvap*100

#cvap BLACK voters
black.voters.cvap <- sum(cvap.black$cvap_est[substr(cvap.black$GEO_ID, 1, 2) == "37"])
#number of BLACK voters, self-reported
black.voters.sr <- sum(nc_sub$voted.16g[nc_sub$sr.black == 1])
overall.sr.black <- black.voters.sr/black.voters.cvap*100
#number of BLACK voters, BISG
black.voters.bisg <- sum(nc_sub$voted.16g[nc_sub$predicted.black == 1])
overall.bisg.black <- black.voters.bisg/black.voters.cvap*100
#number of BLACK voters, BISG+RF
black.voters.rf <- sum(nc_sub$voted.16g[nc_sub$predicted.black.rf == 1])
overall.rf.black <- black.voters.rf/black.voters.cvap*100

#cvap latino voters
latino.voters.cvap <- sum(cvap.latino$cvap_est[substr(cvap.latino$GEO_ID, 1, 2) == "37"])
#number of latino voters, self-reported
latino.voters.sr <- sum(nc_sub$voted.16g[nc_sub$sr.hispanic == 1])
overall.sr.latino <- latino.voters.sr/latino.voters.cvap*100
#number of latino voters, BISG
latino.voters.bisg <- sum(nc_sub$voted.16g[nc_sub$predicted.latino == 1])
overall.bisg.latino <- latino.voters.bisg/latino.voters.cvap*100
#number of latino voters, BISG+RF
latino.voters.rf <- sum(nc_sub$voted.16g[nc_sub$predicted.latino.rf == 1])
overall.rf.latino <- latino.voters.rf/latino.voters.cvap*100

#cvap asian voters
asian.voters.cvap <- sum(cvap.asian$cvap_est[substr(cvap.asian$GEO_ID, 1, 2) == "37"])
#number of asian voters, self-reported
asian.voters.sr <- sum(nc_sub$voted.16g[nc_sub$sr.asian == 1])
overall.sr.asian <- asian.voters.sr/asian.voters.cvap*100
#number of asian voters, BISG
asian.voters.bisg <- sum(nc_sub$voted.16g[nc_sub$predicted.asian == 1])
overall.bisg.asian <- asian.voters.bisg/asian.voters.cvap*100
#number of asian voters, BISG+RF
asian.voters.rf <- sum(nc_sub$voted.16g[nc_sub$predicted.asian.rf == 1])
overall.rf.asian <- asian.voters.rf/asian.voters.cvap*100

round(overall.sr.white, 2)
round(overall.bisg.white, 2)
round(100*(overall.bisg.white-overall.sr.white)/overall.sr.white, 2)
round(overall.rf.white, 2)
round(100*(overall.rf.white-overall.sr.white)/overall.sr.white, 2)

round(overall.sr.black, 2)
round(overall.bisg.black, 2)
round(100*(overall.bisg.black-overall.sr.black)/overall.sr.black, 2)
round(overall.rf.black, 2)
round(100*(overall.rf.black-overall.sr.black)/overall.sr.black, 2)

round(overall.sr.latino, 2)
round(overall.bisg.latino, 2)
round(100*(overall.bisg.latino-overall.sr.latino)/overall.sr.latino, 2)
round(overall.rf.latino, 2)
round(100*(overall.rf.latino-overall.sr.latino)/overall.sr.latino, 2)

round(overall.sr.asian, 2)
round(overall.bisg.asian, 2)
round(100*(overall.bisg.asian-overall.sr.asian)/overall.sr.asian, 2)
round(overall.rf.asian, 2)
round(100*(overall.rf.asian-overall.sr.asian)/overall.sr.asian, 2)


#############################
####TRACT DIVERSITY
#############################
nc_sub$tract.largest.race <- ""
nc_sub$tract.largest.race[nc_sub$cvap.pct.white > nc_sub$cvap.pct.black & nc_sub$cvap.pct.white > nc_sub$cvap.pct.latino & nc_sub$cvap.pct.white > nc_sub$cvap.pct.asian] <- "White"
nc_sub$tract.largest.race[nc_sub$cvap.pct.black > nc_sub$cvap.pct.white & nc_sub$cvap.pct.black > nc_sub$cvap.pct.latino & nc_sub$cvap.pct.black > nc_sub$cvap.pct.asian] <- "Black"
nc_sub$tract.largest.race[nc_sub$cvap.pct.latino > nc_sub$cvap.pct.white & nc_sub$cvap.pct.latino > nc_sub$cvap.pct.black & nc_sub$cvap.pct.latino > nc_sub$cvap.pct.asian] <- "Hispanic"
nc_sub$tract.largest.race[nc_sub$cvap.pct.asian > nc_sub$cvap.pct.white & nc_sub$cvap.pct.asian > nc_sub$cvap.pct.black & nc_sub$cvap.pct.asian > nc_sub$cvap.pct.latino] <- "Asian"

nc_sub$local.minority <- 0
nc_sub$local.minority[nc_sub$sr.white == 1 & nc_sub$tract.largest.race != "White"] <- 1
nc_sub$local.minority[nc_sub$sr.black == 1 & nc_sub$tract.largest.race != "Black"] <- 1
nc_sub$local.minority[nc_sub$sr.hispanic == 1 & nc_sub$tract.largest.race != "Hispanic"] <- 1
nc_sub$local.minority[nc_sub$sr.asian == 1 & nc_sub$tract.largest.race != "Asian"] <- 1

nc_sub$local.minority.bisg <- 0
nc_sub$local.minority.bisg[nc_sub$predicted.white == 1 & nc_sub$tract.largest.race != "White"] <- 1
nc_sub$local.minority.bisg[nc_sub$predicted.black == 1 & nc_sub$tract.largest.race != "Black"] <- 1
nc_sub$local.minority.bisg[nc_sub$predicted.latino == 1 & nc_sub$tract.largest.race != "Hispanic"] <- 1
nc_sub$local.minority.bisg[nc_sub$predicted.asian == 1 & nc_sub$tract.largest.race != "Asian"] <- 1

nc_sub$local.minority.rf <- 0
nc_sub$local.minority.rf[nc_sub$predicted.white.rf == 1 & nc_sub$tract.largest.race != "White"] <- 1
nc_sub$local.minority.rf[nc_sub$predicted.black.rf == 1 & nc_sub$tract.largest.race != "Black"] <- 1
nc_sub$local.minority.rf[nc_sub$predicted.latino.rf == 1 & nc_sub$tract.largest.race != "Hispanic"] <- 1
nc_sub$local.minority.rf[nc_sub$predicted.asian.rf == 1 & nc_sub$tract.largest.race != "Asian"] <- 1


minority.actual <- tapply(X=nc_sub$local.minority, INDEX = nc_sub$sr.race, function(x) mean(x))* 100
minority.predicted <- tapply(X=nc_sub$local.minority.bisg, INDEX = nc_sub$predicted.race, function(x) mean(x))* 100
minority.predicted.rf <- tapply(X=nc_sub$local.minority.rf, INDEX = nc_sub$rf.race, function(x) mean(x))* 100


round(minority.actual[c(5,2,3,1)], 2)
# White    Black    Hispanic    Asian 
# 5.73    57.66     100.00    100.00 
round(minority.predicted[c(6,3,4,2)], 2)
# White    Black    Hispanic    Asian 
#  4.20    32.67    100.00      100.00
round((minority.predicted[c(6,3,4,2)] - minority.actual[c(5,2,3,1)]) / minority.actual[c(5,2,3,1)] * 100, 2)
# White    Black    Hispanic    Asian 
# -26.66   -43.34     0.00     0.00

round(minority.predicted.rf[c(5,2,3,1)], 2)
# White    Black    Hispanic    Asian 
# 4.14    50.64     100.00      100.00
round((minority.predicted.rf[c(5,2,3,1)] - minority.actual[c(5,2,3,1)]) / minority.actual[c(5,2,3,1)] * 100, 2)
# White    Black    Hispanic    Asian 
# -27.72   -12.18     0.00     0.00


##################################
#FIGURE A.2
##################################
#BLACK
#proportion black by income of tract
proportion.actual.black.by.tract.income <- tapply(X = nc_sub$sr.black, INDEX = nc_sub$income_rounded, function(x) mean(x, na.rm = T))
proportion.predicted.black.by.tract.income <- tapply(X = nc_sub$predicted.black, INDEX = nc_sub$income_rounded, function(x) mean(x, na.rm = T))
proportion.predicted.rf.black.by.tract.income <- tapply(X = nc_sub$predicted.black.rf, INDEX = nc_sub$income_rounded, function(x) mean(x, na.rm = T))

diff.black.tract.income <- proportion.predicted.black.by.tract.income - proportion.actual.black.by.tract.income
diff.rf.black.tract.income <- proportion.predicted.rf.black.by.tract.income - proportion.actual.black.by.tract.income

point.size <- tapply(X = nc_sub$sr.black, INDEX = nc_sub$income_rounded, length)


m <- loess(diff.black.tract.income ~ names(diff.black.tract.income), weights = point.size, span = .6)
m.forest <- loess(diff.rf.black.tract.income ~ names(diff.rf.black.tract.income), weights = point.size, span = .6)

plot(names(diff.black.tract.income), diff.black.tract.income, ylim = c(-.2, .2), xlab = "Median Income of Census Tract ($1,000)", main = "Black Voters", ylab = "Predicted minus Self Report Proportion", pch = 16, col = "#99999999", axes = F, xlim = c(0, 220000), cex = 0)
abline(h=0, lty = 2, col = "grey80")
lines(x=m$x, y=m$fitted, lwd = 3, col = "dark red")
lines(x=m.forest$x, y=m.forest$fitted, lwd = 2, col = "dark green", lty = 2)
axis(side = 1, labels = seq(0, 220, 20), at = seq(0, 220000, 20000), cex.axis = .9)
axis(side = 2, labels = seq(-.2, .2, .05), at = seq(-.2, .2, .05), las = 2)
box()
text(120000, -.01, "BISG + Random Forest")
text(120000, -.10, "BISG")



#WHITE
#proportion white by income of tract
proportion.actual.white.by.tract.income <- tapply(X = nc_sub$sr.white, INDEX = nc_sub$income_rounded, function(x) mean(x, na.rm = T))
proportion.predicted.white.by.tract.income <- tapply(X = nc_sub$predicted.white, INDEX = nc_sub$income_rounded, function(x) mean(x, na.rm = T))
proportion.predicted.rf.white.by.tract.income <- tapply(X = nc_sub$predicted.white.rf, INDEX = nc_sub$income_rounded, function(x) mean(x, na.rm = T))

diff.white.tract.income <- proportion.predicted.white.by.tract.income - proportion.actual.white.by.tract.income
diff.rf.white.tract.income <- proportion.predicted.rf.white.by.tract.income - proportion.actual.white.by.tract.income

point.size <- tapply(X = nc_sub$sr.white, INDEX = nc_sub$income_rounded, length)


m <- loess(diff.white.tract.income ~ names(diff.white.tract.income), weights = point.size, span = .6)
m.forest <- loess(diff.rf.white.tract.income ~ names(diff.rf.white.tract.income), weights = point.size, span = .6)

plot(names(diff.white.tract.income), diff.white.tract.income, ylim = c(-.2, .2), xlab = "Median Income of Census Tract ($1,000)", main = "White Voters", ylab = "Predicted minus Self Report Proportion", pch = 16, col = "#99999999", axes = F, xlim = c(0, 220000), cex = 0)
abline(h=0, lty = 2, col = "grey80")
lines(x=m$x, y=m$fitted, lwd = 3, col = "dark red")
lines(x=m.forest$x, y=m.forest$fitted, lwd = 3, col = "dark green", lty = 2)
axis(side = 1, labels = seq(0, 220, 20), at = seq(0, 220000, 20000), cex.axis = .9)
axis(side = 2, labels = seq(-.2, .2, .05), at = seq(-.2, .2, .05), las = 2)
box()
text(120000, 0, "BISG + Random Forest")
text(120000, .10, "BISG")



#HISPANIC
#proportion hispanic by income of tract
proportion.actual.hispanic.by.tract.income <- tapply(X = nc_sub$sr.hispanic, INDEX = nc_sub$income_rounded, function(x) mean(x, na.rm = T))
proportion.predicted.hispanic.by.tract.income <- tapply(X = nc_sub$predicted.latino, INDEX = nc_sub$income_rounded, function(x) mean(x, na.rm = T))
proportion.predicted.rf.hispanic.by.tract.income <- tapply(X = nc_sub$predicted.latino.rf, INDEX = nc_sub$income_rounded, function(x) mean(x, na.rm = T))

diff.hispanic.tract.income <- proportion.predicted.hispanic.by.tract.income - proportion.actual.hispanic.by.tract.income
diff.rf.hispanic.tract.income <- proportion.predicted.rf.hispanic.by.tract.income - proportion.actual.hispanic.by.tract.income

point.size <- tapply(X = nc_sub$sr.hispanic, INDEX = nc_sub$income_rounded, length)


m <- loess(diff.hispanic.tract.income ~ names(diff.hispanic.tract.income), weights = point.size, span = .6)
m.forest <- loess(diff.rf.hispanic.tract.income ~ names(diff.rf.hispanic.tract.income), weights = point.size, span = .6)

plot(names(diff.hispanic.tract.income), diff.hispanic.tract.income, ylim = c(-.2, .2), xlab = "Median Income of Census Tract ($1,000)", main = "Hispanic Voters", ylab = "Predicted minus Self Report Proportion", pch = 16, col = "#99999999", axes = F, xlim = c(0, 220000), cex = 0)
abline(h=0, lty = 2, col = "grey80")
lines(x=m$x, y=m$fitted, lwd = 3, col = "dark red")
lines(x=m.forest$x, y=m.forest$fitted, lwd = 3, col = "dark green", lty = 2)
axis(side = 1, labels = seq(0, 220, 20), at = seq(0, 220000, 20000), cex.axis = .9)
axis(side = 2, labels = seq(-.2, .2, .05), at = seq(-.2, .2, .05), las = 2)
box()
text(60000, 0.03, "BISG + Random Forest")
text(60000, -0.01, "BISG")



#ASIAN
#proportion asian by income of tract
proportion.actual.asian.by.tract.income <- tapply(X = nc_sub$sr.asian, INDEX = nc_sub$income_rounded, function(x) mean(x, na.rm = T))
proportion.predicted.asian.by.tract.income <- tapply(X = nc_sub$predicted.asian, INDEX = nc_sub$income_rounded, function(x) mean(x, na.rm = T))
proportion.predicted.rf.asian.by.tract.income <- tapply(X = nc_sub$predicted.asian.rf, INDEX = nc_sub$income_rounded, function(x) mean(x, na.rm = T))

diff.asian.tract.income <- proportion.predicted.asian.by.tract.income - proportion.actual.asian.by.tract.income
diff.rf.asian.tract.income <- proportion.predicted.rf.asian.by.tract.income - proportion.actual.asian.by.tract.income

point.size <- tapply(X = nc_sub$sr.asian, INDEX = nc_sub$income_rounded, length)


m <- loess(diff.asian.tract.income ~ names(diff.asian.tract.income), weights = point.size, span = .6)
m.forest <- loess(diff.rf.asian.tract.income ~ names(diff.rf.asian.tract.income), weights = point.size, span = .6)

plot(names(diff.asian.tract.income), diff.asian.tract.income, ylim = c(-.2, .2), xlab = "Median Income of Census Tract ($1,000)", main = "Asian Voters", ylab = "Predicted minus Self Report Proportion", pch = 16, col = "#99999999", axes = F, xlim = c(0, 220000), cex = 0)
abline(h=0, lty = 2, col = "grey80")
lines(x=m$x, y=m$fitted, lwd = 3, col = "dark red")
lines(x=m.forest$x, y=m.forest$fitted, lwd = 3, col = "dark green", lty = 2)
axis(side = 1, labels = seq(0, 220, 20), at = seq(0, 220000, 20000), cex.axis = .9)
axis(side = 2, labels = seq(-.2, .2, .05), at = seq(-.2, .2, .05), las = 2)
box()
text(160000, 0.03, "BISG + Random Forest")
text(160000, 0.01, "BISG")

